1 Preparation


1.1 Genotyped data set preparation


First we define input and output path shortcuts.

# Shortcuts to the project's input and output data folders, both relative to
# the current working directory.
input <- file.path(getwd(), "data", "Input")
output <- file.path(getwd(), "data", "Output")

Then we delete the output folder, and all the files and folders it contains, from the previous run.

unlink(output, recursive = TRUE)

The following code chunk consists of the arguments that this R script can take from the user: “verbose” controls whether to print output to the terminal; “imputated” indicates whether the program should expect an imputed dataset; “EP” performs the needed data correction before analysis in the Epistasis Project dataset; and “ADNI” indicates whether the input dataset also includes the ADNI dataset.

verbose <- FALSE

Then, we proceed to read and store the genotyped input data.

# Codes that represent a missing genotype in the raw genotyped file
encoded_NA <- c("00", "", "???", "-9")
df <- read.csv2(file.path(input, "genotyped.csv"), na.strings = encoded_NA)

# Remove unusable markers in a single assignment:
# the indel (not a SNP) and two markers with unclear genotyping
# that were removed because of a suspected error.
dropped_markers <- c("DHFR_rs70991108_INDEL",
                     "PPARG_rs709149",
                     "MS4A4E_rs670139")
df[dropped_markers] <- NULL
# Keep only rows that carry at least one non-missing value
df <- df[rowSums(is.na(df)) < ncol(df), ]
str(df, list.len = 30)
## 'data.frame':    2441 obs. of  97 variables:
##  $ ID                : chr  "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
##  $ Plate             : chr  "AST5" "AST5" "AST5" "AST5" ...
##  $ WELL              : chr  "B01" "B03" "D01" "B05" ...
##  $ CentreCODE        : chr  "RG 728" "RG 733" "RG 606" "RG 743" ...
##  $ Diag              : chr  "AD" "AD" "AD" "AD" ...
##  $ SexLett           : chr  "Male" "Female" "Male" "Female" ...
##  $ Age               : int  69 69 81 76 75 75 80 74 71 76 ...
##  $ AgeExam           : num  69 69 81 76 75 75 80 74 71 76 ...
##  $ AgeOnset          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AgeDeath          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Centre_APOE       : int  33 34 33 33 33 33 23 33 23 33 ...
##  $ Centre            : chr  "OVIEDO" "OVIEDO" "OVIEDO" "OVIEDO" ...
##  $ rs429358          : chr  "TT" "CT" "TT" "TT" ...
##  $ rs7412            : chr  "CC" "CC" "CC" "CC" ...
##  $ LGC_APOE          : int  33 34 33 33 33 33 23 33 23 33 ...
##  $ LGC_E4.           : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FID               : chr  "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
##  $ IID               : chr  "ASR382AST5B01" "ASR383AST5B03" "ASR383AST5D01" "ASR384AST5B05" ...
##  $ PID               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MID               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sex               : int  1 2 1 2 2 1 1 2 2 1 ...
##  $ Dx                : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ SORT1_rs17585355  : chr  "AA" "AA" "AA" "AA" ...
##  $ SORT1_rs2228604   : chr  "CC" "CC" "AC" "AC" ...
##  $ SORT1_rs7536292   : chr  "TT" "CT" "TT" "TT" ...
##  $ SORT1_rs17646665  : chr  "AA" "AA" "AA" "AA" ...
##  $ SORT1_rs1149175   : chr  "GG" "GG" "GG" "GG" ...
##  $ SORT1_rs10745354  : chr  "TT" "TT" "CT" "CT" ...
##  $ IL10_rs1800896    : chr  "AA" "GA" "GG" "GA" ...
##  $ BIN1_rs744373     : chr  "GG" "GA" "GA" "GG" ...
##   [list output truncated]

Reading the SNPs that we want to study.

# Read the whitespace-separated list of SNP ids (format GENE_rsID) to study.
vector_of_gene_snps <- scan(file.path(input, "snps.txt"),
                            what = "character",
                            quiet = !verbose)
vector_of_gene_snps
##  [1] "TCN2_rs5749131"     "TCN2_rs5753231"     "TCN2_rs16988828"   
##  [4] "TCN2_rs9606756"     "TCN2_rs757874"      "TCN2_rs1801198"    
##  [7] "TCN2_rs5749135"     "TCN2_rs9621049"     "TCN2_rs1131603"    
## [10] "TCN2_rs7289553"     "TCN2_rs5997711"     "MTRR_rs1801394"    
## [13] "MTRR_rs13181011"    "MTRR_rs326120"      "MTRR_rs1532268"    
## [16] "MTRR_rs7703033"     "MTRR_rs6555501"     "MTRR_rs162035"     
## [19] "MTRR_rs3815743"     "MTRR_rs162040"      "MTRR_rs10475399"   
## [22] "SORT1_rs17585355"   "SORT1_rs2228604"    "SORT1_rs7536292"   
## [25] "SORT1_rs17646665"   "SORT1_rs1149175"    "SORT1_rs10745354"  
## [28] "BDNF_rs6265"        "BDNF_rs11030102"    "BDNF_rs11030104"   
## [31] "BDNF_rs11030119"    "BDNF_rs12288512"    "SHMT1_rs16961153"  
## [34] "SHMT1_rs1979277"    "SHMT1_rs669340"     "SHMT1_rs643333"    
## [37] "SHMT1_rs9901160"    "RFC1_rs11096990"    "RFC1_rs4975002"    
## [40] "RFC1_rs4975009"     "RFC1_rs6851075"     "APOE_rs429358"     
## [43] "APOE_rs7412"        "APOE_rs597668"      "ABCA1_rs1800977"   
## [46] "ABCA1_rs2422493"    "CYP46A1_rs7157609"  "CYP46A1_rs4900442" 
## [49] "DBH_rs1611115"      "DBH_rs1611131"      "GAB2_rs4945261"    
## [52] "GAB2_rs2373115"     "MAPT_rs242557"      "MAPT_rs2471738"    
## [55] "NPC1_rs2236707"     "NPC1_rs4800488"     "NR1H2_rs2695121"   
## [58] "NR1H2_rs1052533"    "NTRK2_rs1187323"    "NTRK2_rs1545285"   
## [61] "PEMT_rs7946"        "PEMT_rs12325817"    "VDR_rs731236"      
## [64] "VDR_rs7975232"      "BIN1_rs744373"      "CD33_rs3865444"    
## [67] "CDK5_rs2069442"     "HMDX1_rs2071746"    "HMGCR_rs3931914"   
## [70] "IL10_rs1800896"     "LRP1_rs1799986"     "MTHFD1L_rs11754661"
## [73] "PRNP_rs1799990"     "SLC19A1_rs1051266"  "TRAK2_rs13022344"
length(vector_of_gene_snps)
## [1] 75

Choosing other variables of interest.

variables <- c("ID", "Diag", "SexLett", "Age", "Centre", "LGC_E4.")

Subsetting the dataframe with the desired variables

# Keep only the chosen covariates plus the SNPs under study.
df <- df[, c(variables, vector_of_gene_snps)]
dim(df)
## [1] 2441   81

After that, we order the genotypes so that the heterozygotes are always sorted in alphabetical order (“A”, “C”, “G”, “T”).

df[,c(vector_of_gene_snps)] <- lapply(df[,c(vector_of_gene_snps)], order_heterozygotes)

Then we create a list of objects that for each SNP contains information about its name, gene it belongs, genotype counts, genotype frequencies, allele frequencies and counts, minor allele and major allele etc.

# Build one descriptive object per SNP (gene, id, genotype/allele counts and
# frequencies, minor/major allele — see the example printed below).
# NOTE(review): 'exists' presumably tells generate_object() whether to extend
# a list left over from a previous call rather than rebuild it — confirm in
# the helper's definition.
list_of_objects <- generate_object(exists = exists('list_of_objects'),
                                   dataset = df,
                                   snps = vector_of_gene_snps,
                                   verbose = verbose)
list_of_objects
## This is just a convenience function, invocated when print() is used, to provide an overview of the genes and snps present in this list. To view its contents, use list_of_objects$your_snp_id
## 
## 
## $snps
##  [1] "rs5749131"  "rs5753231"  "rs16988828" "rs9606756"  "rs757874"  
##  [6] "rs1801198"  "rs5749135"  "rs9621049"  "rs1131603"  "rs7289553" 
## [11] "rs5997711"  "rs1801394"  "rs13181011" "rs326120"   "rs1532268" 
## [16] "rs7703033"  "rs6555501"  "rs162035"   "rs3815743"  "rs162040"  
## [21] "rs10475399" "rs17585355" "rs2228604"  "rs7536292"  "rs17646665"
## [26] "rs1149175"  "rs10745354" "rs6265"     "rs11030102" "rs11030104"
## [31] "rs11030119" "rs12288512" "rs16961153" "rs1979277"  "rs669340"  
## [36] "rs643333"   "rs9901160"  "rs11096990" "rs4975002"  "rs4975009" 
## [41] "rs6851075"  "rs429358"   "rs7412"     "rs597668"   "rs1800977" 
## [46] "rs2422493"  "rs7157609"  "rs4900442"  "rs1611115"  "rs1611131" 
## [51] "rs4945261"  "rs2373115"  "rs242557"   "rs2471738"  "rs2236707" 
## [56] "rs4800488"  "rs2695121"  "rs1052533"  "rs1187323"  "rs1545285" 
## [61] "rs7946"     "rs12325817" "rs731236"   "rs7975232"  "rs744373"  
## [66] "rs3865444"  "rs2069442"  "rs2071746"  "rs3931914"  "rs1800896" 
## [71] "rs1799986"  "rs11754661" "rs1799990"  "rs1051266"  "rs13022344"
## 
## $genes
##  [1] "TCN2"    "MTRR"    "SORT1"   "BDNF"    "SHMT1"   "RFC1"    "APOE"   
##  [8] "ABCA1"   "CYP46A1" "DBH"     "GAB2"    "MAPT"    "NPC1"    "NR1H2"  
## [15] "NTRK2"   "PEMT"    "VDR"     "BIN1"    "CD33"    "CDK5"    "HMDX1"  
## [22] "HMGCR"   "IL10"    "LRP1"    "MTHFD1L" "PRNP"    "SLC19A1" "TRAK2"  
## 
## 
## The list has a total of 75 snps and 28 genes

Each of the SNP objects present in the list_of_objects have the following structure:

list_of_objects[[1]]
## ###### rs5749131 ######
## 
## [gene] TCN2
## 
## [id] rs5749131
## 
## [geno_count] AA = 173;  AG = 478;  GG = 288;
## 
## [geno_freq] AA = 18.424;  AG = 50.905;  GG = 30.671;
## 
## [allele_count] A = 824;  G = 1054;
## 
## [allele_freq] A = 43.876;  G = 56.124;
## 
## [minor_allele] A
## 
## [major_allele] G

We can extract just the reference snp id from this object

# Extract the bare rs-ids from the per-SNP objects.
# vapply() replaces sapply(): it guarantees a character vector regardless of
# input (sapply() silently changes return type on empty input), and
# USE.NAMES = FALSE makes the separate unname() step unnecessary.
vector_of_snps <- vapply(list_of_objects, function(x) x$id,
                         character(1), USE.NAMES = FALSE)
vector_of_snps
##  [1] "rs5749131"  "rs5753231"  "rs16988828" "rs9606756"  "rs757874"  
##  [6] "rs1801198"  "rs5749135"  "rs9621049"  "rs1131603"  "rs7289553" 
## [11] "rs5997711"  "rs1801394"  "rs13181011" "rs326120"   "rs1532268" 
## [16] "rs7703033"  "rs6555501"  "rs162035"   "rs3815743"  "rs162040"  
## [21] "rs10475399" "rs17585355" "rs2228604"  "rs7536292"  "rs17646665"
## [26] "rs1149175"  "rs10745354" "rs6265"     "rs11030102" "rs11030104"
## [31] "rs11030119" "rs12288512" "rs16961153" "rs1979277"  "rs669340"  
## [36] "rs643333"   "rs9901160"  "rs11096990" "rs4975002"  "rs4975009" 
## [41] "rs6851075"  "rs429358"   "rs7412"     "rs597668"   "rs1800977" 
## [46] "rs2422493"  "rs7157609"  "rs4900442"  "rs1611115"  "rs1611131" 
## [51] "rs4945261"  "rs2373115"  "rs242557"   "rs2471738"  "rs2236707" 
## [56] "rs4800488"  "rs2695121"  "rs1052533"  "rs1187323"  "rs1545285" 
## [61] "rs7946"     "rs12325817" "rs731236"   "rs7975232"  "rs744373"  
## [66] "rs3865444"  "rs2069442"  "rs2071746"  "rs3931914"  "rs1800896" 
## [71] "rs1799986"  "rs11754661" "rs1799990"  "rs1051266"  "rs13022344"

And then we can convert the categorical variables in the dataframe into the factor datatype.

df[-c(1,4)] <- lapply(df[-c(1,4)], as.factor)


1.2 Imputated data set preparation


First we read and store the imputed input data.

# Missing-value codes used in the imputed file (Excel's #NULL! plus "NA").
encoded_NA <- c("#NULL!", "NA")
imput_df <- read.csv2(file.path(input, "imputated.csv"),
                      na.strings = encoded_NA)
str(imput_df, list.len = 30)
## 'data.frame':    6357 obs. of  93 variables:
##  $ id                : int  3406 4010 12046 3784 2210 8831 1106 5426 3578 8432 ...
##  $ sex               : chr  "Woman" "Woman" "Woman" "Woman" ...
##  $ age.at.entry      : num  58 58 59 58 60 60 55 57 57 57 ...
##  $ Age.to.Use        : num  60 60 60 60 60 60 60 61 61 61 ...
##  $ Age75             : chr  "lessthan75" "lessthan75" "lessthan75" "lessthan75" ...
##  $ prevalent_dementia: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ incident_dementia : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dx                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DiagName          : chr  "CTRL" "CTRL" "CTRL" "CTRL" ...
##  $ ageatdementia     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ followup          : num  2.0589 1.8563 1.2238 1.9439 0.0493 ...
##  $ E4status          : chr  "E4+" "2" "E4-" "E4-" ...
##  $ apoe              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ rs429358          : num  1.017 0.981 1.999 2 1.994 ...
##  $ rs7412            : num  1 1.05 1 2 2 ...
##  $ rs670139          : num  1 2 2 1 2 2 0 2 1 2 ...
##  $ rs242557          : int  2 NA 1 NA 1 0 0 1 1 1 ...
##  $ rs1052533         : int  1 NA 0 1 1 1 2 1 0 0 ...
##  $ rs2471738         : int  2 1 0 0 0 0 0 0 1 1 ...
##  $ rs4975002         : num  0.001 1.001 1.999 1 0.999 ...
##  $ rs1131603         : num  2 2 1.08 2 1.66 ...
##  $ rs7975232         : num  0 2 0 1.07 2 ...
##  $ rs2695121         : num  1 2 1.03 2 1.01 ...
##  $ rs3865444         : num  1.999 0.141 1.005 2 0.784 ...
##  $ rs4975009         : num  0.006 1 2 0 0.808 ...
##  $ rs1800977         : num  1 2 0 0 1 2 1 2 1 1 ...
##  $ rs709149          : num  1 2 1 1 1 0 2 0 1 1 ...
##  $ rs12288512        : num  1 2 2 1 1 ...
##  $ rs4945261         : int  2 1 0 1 2 2 2 2 2 1 ...
##  $ rs7946            : num  0.008 1 0 0 0 0 0 1 0 0 ...
##   [list output truncated]

Then we subset the data of interest.

# Covariates of interest in the imputed dataset; subset them together with
# the SNP columns.
imput_variables <- c("id", "DiagName", "sex", "Age.to.Use", "E4status")
imput_df <- imput_df[,c(imput_variables, vector_of_snps)]
dim(imput_df)
## [1] 6357   80

Then, we import data from the allele schema used in the imputation, in order to reconvert it back to genotypes.

# Allele scheme used by the imputation (semicolon-separated file), restricted
# to the SNPs under study; rename columns to the names used downstream.
imput_scheme <- read.csv(file.path(input, "imputation_scheme.csv"), sep = ";")
imput_scheme <- imput_scheme[imput_scheme$SNP %in% vector_of_snps,]
colnames(imput_scheme) <- c("snp_id", "reference", "alternative")

There are still some SNPs that are not found in the reference scheme. It seems some of the SNPs in the dataset were directly genotyped and codified as dummy allele dosage data, so we will use the human reference genome GRCh37 to recodify them.

# SNPs in our panel that are absent from the imputation scheme: these were
# directly genotyped, so their reference/alternative alleles are taken from
# the GRCh37 reference table instead.
genotyped_snps <- setdiff(vector_of_snps, imput_scheme$snp_id)
genotyped_snps
## [1] "rs1052533"
GRCh37 <- read.table(file.path(input, "GRCh37_snps.txt"), quote = "\"", comment.char = "")
colnames(GRCh37) <- c("chromosome", "location", "snp_id", "reference", "alternative")
# Vectorised replacement of the previous row-by-row loop (which grew the data
# frame one row per iteration and recomputed which() each pass): match()
# preserves the order of genotyped_snps, rbind() appends all rows at once.
extra_rows <- GRCh37[match(genotyped_snps, GRCh37$snp_id),
                     c("snp_id", "reference", "alternative")]
imput_scheme <- rbind(imput_scheme, extra_rows)
row.names(imput_scheme) <- NULL

So the final dataframe used to recodify the imputated data into genotypes will be the following one:

imput_scheme
snp_id reference alternative
rs10475399 A G
rs1051266 T C
rs1052533 G A
rs10745354 T C
rs11030102 C G
rs11030104 A G
rs11030119 G A
rs11096990 C T
rs1131603 T C
rs1149175 G A
rs11754661 G A
rs1187323 A C
rs12288512 G A
rs12325817 C G
rs13022344 T C
rs13181011 T C
rs1532268 C T
rs1545285 T G
rs1611115 C T
rs1611131 A G
rs162035 G A
rs162040 A C
rs16961153 G A
rs16988828 A G
rs17585355 A C
rs17646665 A G
rs1799986 C T
rs1799990 A G
rs1800896 T C
rs1800977 G A
rs1801198 C G
rs1801394 A G
rs1979277 G A
rs2069442 C G
rs2071746 A T
rs2228604 G T
rs2236707 C T
rs2373115 C A
rs2422493 G A
rs242557 A G
rs2471738 T C
rs2695121 C T
rs326120 A G
rs3815743 A G
rs3865444 C A
rs3931914 C G
rs429358 T C
rs4800488 A C
rs4900442 C T
rs4945261 G A
rs4975002 T G
rs4975009 T C
rs5749131 G A
rs5749135 C T
rs5753231 C T
rs597668 T C
rs5997711 C T
rs6265 C T
rs643333 G T
rs6555501 T C
rs669340 C G
rs6851075 T C
rs7157609 G A
rs7289553 A G
rs731236 A G
rs7412 C T
rs744373 A G
rs7536292 T C
rs757874 G T
rs7703033 G A
rs7946 C T
rs7975232 C A
rs9606756 A G
rs9621049 C T
rs9901160 G A

First, though, the imputation data is converted to the numeric datatype.

# Coerce every SNP column (from column 6 onward — the first five are the
# covariates selected above) to numeric allele dosages. Passing as.numeric
# directly removes the redundant anonymous-function wrapper.
imput_df[, 6:ncol(imput_df)] <- lapply(imput_df[, 6:ncol(imput_df)], as.numeric)

Then we proceed to the recoding of the imputation data into genotyped one.

# Recode the imputed allele dosages back into genotype strings using the
# allele scheme; match_strands reconciles strand flips against the genotyped
# data. TRUE is spelled out because T is an ordinary variable that can be
# reassigned, unlike the reserved word TRUE.
imput_df <- genotype_imputated_df(list_of_objects = list_of_objects,
                                  df = imput_df,
                                  scheme = imput_scheme,
                                  match_strands = TRUE,
                                  verbose = verbose)

Then, as we did with the genotyped dataset, we order the genotypes so that the heterozygotes are always sorted in alphabetical order (“A”, “C”, “G”, “T”).

imput_df[,c(vector_of_snps)] <- lapply(imput_df[,c(vector_of_snps)], order_heterozygotes)

And we convert data types from characters to factors

imput_df[-c(1,4)] <- lapply(imput_df[-c(1,4)], as.factor)


1.3 Merging imputated and genotyped datasets


Renaming variables names in the datasets

# Harmonise column names across both datasets so they can be merged; the
# genotyped dataset keeps its extra Centre column.
colnames(df) <- c("ID", "Diag", "Sex", "Age_to_use", "Centre", "E4status", vector_of_snps)
colnames(imput_df) <- c("ID", "Diag", "Sex", "Age_to_use", "E4status", vector_of_snps)

As we are studying LOAD, subset cases higher or equal than 60 years old in the datasets

# LOAD = late-onset Alzheimer's disease, so keep only individuals aged 60+.
# The pre-filter copies exist solely to report how many rows were dropped.
df_old <- df
imput_df_old <- imput_df
# NOTE(review): a missing Age_to_use yields an NA in the logical index and
# hence an all-NA row — confirm ages are complete at this point.
df <- df[df$Age_to_use >= 60,]
imput_df <- imput_df[imput_df$Age_to_use >= 60,]

And we can see that 81 cases were removed from the genotype dataset, and 0 from the imputated.

nrow(df_old) - nrow(df)
## [1] 81
# 81

nrow(imput_df_old) - nrow(imput_df)
## [1] 0
# 0

We can calculate the median of the combined population

median(c(df$Age_to_use, imput_df$Age_to_use))
## [1] 82
# 82

Renaming the levels of E4status in the genotyped dataset.

levels(df$E4status)
## [1] "0" "1"
# Positional rename: relies on the level order printed just above ("0", "1").
levels(df$E4status) <- c("E4-", "E4+")

And the levels of Diagnosis in the imputated dataset.

levels(imput_df$Diag)
## [1] "CTRL"       "dementiaAD"
# Positional rename: relies on the level order printed just above.
levels(imput_df$Diag) <- c("Control", "AD")

We also create a binary variable in both datasets that tells whether the age of the individual is equal or above the median of the population. In this case, as we saw earlier it is 82.

# Dichotomise age at the combined-population median.
# NOTE(review): the 82 is hardcoded — recompute from the data if inputs change.
# Also note the ">82" label actually means ">= 82" (>= is the operator used).
df$Age82 <- ifelse(df$Age_to_use >= 82, ">82", "<82")
imput_df$Age82 <- ifelse(imput_df$Age_to_use >= 82, ">82", "<82")

We found a case of unknown gender in the imputated dataset, so we remove it and reset the levels. And then rename them as “Male” and “Female”.

table(imput_df$Sex)
## 
##     1   Man Woman 
##     1  2528  3828
# Drop the single individual coded "1" (unknown gender), then re-factor so the
# now-unused level disappears before the positional rename below
# ("Man" -> "Male", "Woman" -> "Female").
imput_df <- imput_df[imput_df$Sex != 1,]
imput_df$Sex <- factor(imput_df$Sex) 
levels(imput_df$Sex) <- c("Male", "Female")

And we create a new variable “Centre” to keep track that these cases come from the centre of Rotterdam.

imput_df$Centre <- "ROTTERDAM"

The last step before merging is extracting the data from Rotterdam to fill up the list_of_objects created in a previous step.

# Extend the per-SNP objects with counts/frequencies computed from the
# Rotterdam (imputed) data; list_of_objects already exists here, so the
# imput_* fields are added to the existing entries (see the str() output).
list_of_objects <- generate_object(exists = exists('list_of_objects'), 
                                   dataset = imput_df, 
                                   snps = vector_of_snps)

Then, the final structure obtained would be similar to the following one:

str(list_of_objects)
## This is just a convenience function, invocated when str() is used, to provide an overview of the list elements. To view its contents, use list_of_objects$your_snp_id
## 
## An example of the structure of the objects in the list is the following one:
## 
## rs5749131 : List of 14
##  $ gene              : chr "TCN2"
##  $ id                : chr "rs5749131"
##  $ geno_count        : 'table' int [1:3(1d)] 173 478 288
##  $ geno_freq         : 'table' num [1:3(1d)] 18.4 50.9 30.7
##  $ allele_count      : Named num [1:2] 824 1054
##  $ allele_freq       : Named num [1:2] 43.9 56.1
##  $ minor_allele      : chr "A"
##  $ major_allele      : chr "G"
##  $ imput_geno_count  : 'table' int [1:3(1d)] 934 2336 1527
##  $ imput_geno_freq   : 'table' num [1:3(1d)] 19.5 48.7 31.8
##  $ imput_allele_count: Named num [1:2] 4204 5390
##  $ imput_allele_freq : Named num [1:2] 43.8 56.2
##  $ imput_minor_allele: chr "A"
##  $ imput_major_allele: chr "G"

Once we’ve done that, we can proceed with the merging.

# Outer merge of the genotyped and imputed datasets. all = TRUE keeps rows
# present in only one of the two datasets (TRUE spelled out instead of the
# reassignable T). The SNP columns are part of the merge key so shared
# column names are not duplicated with .x/.y suffixes.
matching_variables <- c("ID", "Diag", "Sex", "E4status", "Age_to_use", "Age82", "Centre", vector_of_snps)
All <- merge(x = df, y = imput_df, by = matching_variables, all = TRUE)
str(All, list.len = 30)
## 'data.frame':    8716 obs. of  82 variables:
##  $ ID        : chr  "10" "100" "1000" "10000" ...
##  $ Diag      : Factor w/ 2 levels "AD","Control": 2 2 1 2 2 2 2 2 2 2 ...
##  $ Sex       : Factor w/ 2 levels "Female","Male": 2 1 2 1 1 2 1 2 2 1 ...
##  $ E4status  : Factor w/ 3 levels "E4-","E4+","2": 1 1 2 1 1 1 1 1 2 1 ...
##  $ Age_to_use: num  82 77 82 78 92 78 82 83 86 87 ...
##  $ Age82     : chr  ">82" "<82" ">82" "<82" ...
##  $ Centre    : Factor w/ 7 levels "BRISTOL","MADRID",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ rs5749131 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 2 3 2 2 2 2 2 ...
##  $ rs5753231 : Factor w/ 3 levels "CC","CT","TT": 2 1 1 2 1 2 2 1 1 2 ...
##  $ rs16988828: Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 1 1 1 1 ...
##  $ rs9606756 : Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 1 2 1 2 ...
##  $ rs757874  : Factor w/ 3 levels "GG","GT","TT": 1 1 2 2 1 2 2 1 1 1 ...
##  $ rs1801198 : Factor w/ 3 levels "CC","CG","GG": 3 1 2 2 1 2 2 2 2 2 ...
##  $ rs5749135 : Factor w/ 3 levels "CC","CT","TT": 1 2 2 2 3 2 2 1 2 1 ...
##  $ rs9621049 : Factor w/ 3 levels "CC","CT","TT": 1 2 1 1 1 1 1 2 1 2 ...
##  $ rs1131603 : Factor w/ 3 levels "CC","CT","TT": 3 3 3 3 2 3 3 3 3 3 ...
##  $ rs7289553 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 2 3 2 2 1 2 2 ...
##  $ rs5997711 : Factor w/ 3 levels "CC","CT","TT": 3 1 2 2 1 2 2 2 2 2 ...
##  $ rs1801394 : Factor w/ 3 levels "AA","AG","GG": 3 2 1 3 3 3 1 2 3 3 ...
##  $ rs13181011: Factor w/ 3 levels "CC","CT","TT": 1 3 3 3 2 2 3 3 2 2 ...
##  $ rs326120  : Factor w/ 3 levels "AA","AG","GG": 1 2 1 1 1 1 3 2 1 1 ...
##  $ rs1532268 : Factor w/ 3 levels "AA","AG","GG": 1 3 2 3 2 2 3 3 2 2 ...
##  $ rs7703033 : Factor w/ 3 levels "AA","AG","GG": 3 2 3 1 2 2 3 2 2 2 ...
##  $ rs6555501 : Factor w/ 3 levels "CC","CT","TT": 3 2 3 1 2 2 3 2 2 2 ...
##  $ rs162035  : Factor w/ 3 levels "AA","AG","GG": 3 3 3 2 3 3 3 3 3 3 ...
##  $ rs3815743 : Factor w/ 3 levels "AA","AG","GG": 1 1 2 1 1 1 1 1 1 1 ...
##  $ rs162040  : Factor w/ 3 levels "AA","AC","CC": 1 2 1 1 1 1 3 1 1 1 ...
##  $ rs10475399: Factor w/ 3 levels "AA","AG","GG": 3 2 2 2 3 3 1 2 3 3 ...
##  $ rs17585355: Factor w/ 3 levels "AA","AC","CC": 1 1 1 1 1 1 1 1 1 2 ...
##  $ rs2228604 : Factor w/ 3 levels "AA","AC","CC": 3 3 2 2 1 2 3 2 2 2 ...
##   [list output truncated]

Before doing any analysis, we should make sure the contrasts are performed with respect to the controls, not the Alzheimer cases.

All$Diag <- relevel(All$Diag, ref = "Control")

Adding binary variables to the merged dataset according to if the data belong to a specific region, being 0 not belonging and 1 belonging.

# One 0/1 indicator column per centre, named in title case (e.g. "Bristol").
centres <- levels(All$Centre)
centres <- stringr::str_to_title(centres)
for (centre in centres) {
  # Vectorised recoding replaces the previous pre-fill + subset-assign, which
  # would error on an NA subscript if Centre were ever missing; !is.na() keeps
  # such rows at 0 instead.
  All[[centre]] <- as.numeric(!is.na(All$Centre) & All$Centre == toupper(centre))
}

Then we divide the dataset into regions according to the centres.

All$Region <- ifelse(All$Centre == "MADRID" | All$Centre == "OVIEDO" | All$Centre == "SANTANDER", "Spain", "N.Eur")


1.4 Quality control


We will perform a quality control of the samples before proceeding to the analysis step. An example of typical quality control procedures usually performed can be found here


1.4.1 Formatting dataset


Once we’ve merged both datasets we can proceed with quality control procedures.

We’ve previously prepared a map file with the SNPs under study. It was generated with the GRCh38 genome assembly version.

# Plink map file: chromosome, SNP id, genetic distance, base-pair position.
map <- read.delim(file.path(input, "raw_data.map"), header = FALSE)
colnames(map) <- c("Chr", "SNP", "GD", "BPP")

The file has the following structure:

map
Chr SNP GD BPP
1 rs10745354 0 109389286
1 rs1149175 0 109379755
1 rs17585355 0 109315193
1 rs17646665 0 109369429
1 rs1800896 0 206773552
1 rs2228604 0 109342153

We can see that the snps ids are exactly the same found in our dataset so we already got all the information we need for the map file.

# Sanity check: the map file and our SNP list must contain exactly the same
# ids (both set differences empty).
setdiff(map$SNP, names(list_of_objects))
## character(0)
setdiff(names(list_of_objects), map$SNP)
## character(0)

Now, we take care of the ped file. First, we create an empty ped file structure that we will fill up with the data.

# Empty ped skeleton, one row per individual: family id, individual id,
# paternal id, maternal id, sex and phenotype.
ped <- data.frame(matrix(ncol = 6, nrow = nrow(All)))
colnames(ped) <- c("FID", "IID", "PID", "MID", "Sex", "P")

Then we introduce the columns values for each of the defined variables

ped$FID <- All$ID # It is the same plink does when the flag --no-fid is used
ped$IID <- All$ID
ped$PID <- 0 # Paternal id unknown
ped$MID <- 0 # Maternal id unknown
# Vectorised named-lookup recoding replaces the per-element sapply() if-chains
# (same result, one pass over the vector instead of one function call per row).
# Sex: Male = 1, Female = 2; anything else, including NA, = 0 (unknown).
ped$Sex <- unname(c(Male = 1, Female = 2)[as.character(All$Sex)])
ped$Sex[is.na(ped$Sex)] <- 0
# Phenotype: Control = 1, AD = 2; anything else, including NA, = -9 (missing).
ped$P <- unname(c(Control = 1, AD = 2)[as.character(All$Diag)])
ped$P[is.na(ped$P)] <- -9

If we take look we can see that the variables have been coded correctly

head(All[,c("ID", "Sex", "Diag")])
##      ID    Sex    Diag
## 1    10   Male Control
## 2   100 Female Control
## 3  1000   Male      AD
## 4 10000 Female Control
## 5 10004 Female Control
## 6 10005   Male Control
head(ped[,c("IID", "Sex", "P")])
##     IID Sex P
## 1    10   1 1
## 2   100   2 1
## 3  1000   1 2
## 4 10000   2 1
## 5 10004   2 1
## 6 10005   1 1

Now the final step is to add the genotypes. As the version of plink we are using (PLINK v1.90) parses the genotypes itself, we don’t need to split our columns into one column per allele, and we can bind them directly to our ped file.

ped <- cbind(ped, All[,map$SNP])

But we have yet some missing values remaining in our dataset.

colSums(is.na(ped))
##        FID        IID        PID        MID        Sex          P rs10745354 
##          0          0          0          0          0          0        468 
##  rs1149175 rs17585355 rs17646665  rs1800896  rs2228604  rs7536292 rs13022344 
##        455        459        447        460        469        493        471 
##   rs744373 rs11096990  rs4975002  rs4975009  rs6851075 rs10475399 rs13181011 
##        493        796        491        467        488        485        452 
##  rs1532268   rs162035   rs162040  rs1801394   rs326120  rs3815743  rs3931914 
##        483        457        477        471        486        494        468 
##  rs6555501  rs7703033 rs11754661  rs2069442  rs1187323  rs1545285  rs1611115 
##        485        471        479        493        478        484        457 
##  rs1611131  rs1800977  rs2422493 rs11030102 rs11030104 rs11030119 rs12288512 
##        841        466        483        492        477        491        464 
##  rs2373115  rs4945261     rs6265  rs1799986   rs731236  rs7975232  rs4900442 
##        462        468        456        502        497        478        458 
##  rs7157609 rs12325817 rs16961153  rs1979277   rs242557  rs2471738   rs643333 
##        509        506        461        480        330        347        477 
##   rs669340     rs7946  rs9901160  rs2236707  rs4800488  rs1052533  rs2695121 
##        455        488        459        471        463        473        482 
##  rs3865444   rs429358   rs597668     rs7412  rs1799990  rs1051266  rs1131603 
##        468        580        485        469        475        472        452 
## rs16988828  rs1801198  rs2071746  rs5749131  rs5749135  rs5753231  rs5997711 
##        455        472        457        490        457        482        475 
##  rs7289553   rs757874  rs9606756  rs9621049 
##        497        476        464        466

And we convert the missing values to 00, as plink expects.

# SNP columns must be character before the "00" missing-genotype code
# can be inserted.
ped[,map$SNP] <- lapply(ped[,map$SNP], as.character)
# NOTE(review): this replaces NAs in the whole data frame, not just the SNP
# columns — safe here only because the colSums() check above shows NAs occur
# exclusively in SNP columns.
ped[is.na(ped)] <- "00"
sum(is.na(ped))
## [1] 0

Then we can save the ped file in order to use it as input for plink.

write.table(ped, file = file.path(input, "raw_data.ped"), quote = F, sep = "\t", row.names = F, col.names = F)


1.4.3 Making binary files


The first step is to create a folder where to store the output that we will obtain in all subsequent steps using the program.

# Create the plink output folder. The first line is the Unix shell command as
# shown in the text; run_shell() executes the Windows equivalent (%CD% is the
# Windows counterpart of $PWD).
mkdir -p "$PWD"/data/Output/plink/0-Data
run_shell('mkdir %CD%/data/Output/plink/0-Data')

The next step step will be to convert the files obtained into binary files to speed up the computation.

# Convert the ped/map pair into plink binary format (.bed/.bim/.fam) so later
# QC steps run faster.
plink --file "$PWD"/data/Input/raw_data --make-bed --out "$PWD"/data/Output/plink/0-Data/binary_data
run_shell('plink --file %CD%/data/Input/raw_data --make-bed --out %CD%/data/Output/plink/0-Data/binary_data', ignore.stdout = !verbose)


1.4.4 Removing missingness


We first filter SNPs and individuals based on a relaxed threshold (0.2; >20%), as this will filter out SNPs and individuals with very high levels of missingness. Then a filter with a more stringent threshold will be applied (0.02). We do first SNP filtering before individual filtering.

# Missingness filtering: --geno drops SNPs, --mind drops individuals.
# A relaxed 0.2 pass runs first, then a stringent 0.02 pass; within each pass
# SNP filtering precedes individual filtering.
mkdir -p "$PWD"/data/Output/plink/1-QC/1-Missingness
plink --bfile "$PWD"/data/Output/plink/0-Data/binary_data --missing --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing
plink --bfile "$PWD"/data/Output/plink/0-Data/binary_data --geno 0.2 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_1
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_1 --mind 0.2 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_2
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_2 --geno 0.02 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_3
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_3 --mind 0.02 --make-bed --out "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_4
run_shell('mkdir %CD%/data/Output/plink/1-QC/1-Missingness')
run_shell('plink --bfile %CD%/data/Output/plink/0-Data/binary_data --missing --out %CD%/data/Output/plink/1-QC/1-Missingness/missing', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/0-Data/binary_data --geno 0.2 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_1', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_1 --mind 0.2 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_2', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_2 --geno 0.02 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_3', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_3 --mind 0.02 --make-bed --out %CD%/data/Output/plink/1-QC/1-Missingness/missing_4', ignore.stdout = !verbose)


1.4.5 Removing SNPs with lower MAF than threshold


SNPs with a low MAF are rare, therefore power is lacking for detecting SNP‐phenotype associations. These SNPs are also more prone to genotyping errors. The MAF threshold depends on sample size, larger samples can use lower MAF thresholds. As the sample size in this study is moderate (N = 8716), we use the 0.05 as MAF threshold.

# Drop SNPs with a minor allele frequency below 0.05.
mkdir "$PWD"/data/Output/plink/1-QC/2-MAF
plink --bfile "$PWD"/data/Output/plink/1-QC/1-Missingness/missing_4 --maf 0.05 --make-bed --out "$PWD"/data/Output/plink/1-QC/2-MAF/minor_allele
run_shell('mkdir %CD%/data/Output/plink/1-QC/2-MAF')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/1-Missingness/missing_4 --maf 0.05 --make-bed --out %CD%/data/Output/plink/1-QC/2-MAF/minor_allele', ignore.stdout = !verbose)


1.4.6 Looking at deviations from Hardy-Weinberg equilibrium


Deviations from HWE is a common indicator of genotyping error, but it may also indicate evolutionary selection. We will use a threshold of 10^-6 for controls and 10^-10 for cases.

# HWE filter: 1e-6 threshold on controls first, then 1e-10 including cases
# (include-nonctrl).
mkdir "$PWD"/data/Output/plink/1-QC/3-HWE
plink --bfile "$PWD"/data/Output/plink/1-QC/2-MAF/minor_allele --hwe 1e-6 --make-bed --out "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg --hwe 1e-10 include-nonctrl --make-bed --out "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1
run_shell('mkdir %CD%/data/Output/plink/1-QC/3-HWE')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/2-MAF/minor_allele --hwe 1e-6 --make-bed --out %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg', ignore.stdout = !verbose)
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg --hwe 1e-10 include-nonctrl --make-bed --out %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1', ignore.stdout = !verbose)


1.4.7 Removing individuals with a heterozygosity rate deviating more than 3SD from the mean


Deviations can indicate sample contamination or inbreeding. Heterozygosity checks are performed on a set of SNPs that are not highly correlated. To generate this list of non-(highly)correlated SNPs, we exclude high inversion regions.

# LD pruning (window 50, step 5, r^2 threshold 0.2), excluding the listed
# inversion regions, to obtain an independent SNP set for the check.
mkdir "$PWD"/data/Output/plink/1-QC/4-Het
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --exclude "$PWD"/data/Input/inversion.txt --range --indep-pairwise 50 5 0.2 --out "$PWD"/data/Output/plink/1-QC/4-Het/indepSNP
run_shell('mkdir %CD%/data/Output/plink/1-QC/4-Het')
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --exclude %CD%/data/Input/inversion.txt --range --indep-pairwise 50 5 0.2 --out %CD%/data/Output/plink/1-QC/4-Het/indepSNP', ignore.stdout = !verbose)

Once we’ve done that we can compute the heterozygosity rates

# Compute per-individual heterozygosity on the pruned, independent SNP set.
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --extract "$PWD"/data/Output/plink/1-QC/4-Het/indepSNP.prune.in --het --out "$PWD"/data/Output/plink/1-QC/4-Het/Het_check
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --extract %CD%/data/Output/plink/1-QC/4-Het/indepSNP.prune.in --het --out %CD%/data/Output/plink/1-QC/4-Het/Het_check', ignore.stdout = !verbose)

From this file we can generate a list of individuals who deviate more than 3 standard deviations from the heterozygosity rate mean.

# All credit to Andries Marees at https://github.com/MareesAT/GWA_tutorial
# Flag individuals whose heterozygosity rate is more than 3 SD away from the
# sample mean (possible contamination or inbreeding) and write their FID/IID
# pairs for plink --remove.
het <- read.table(file.path(output, "plink", "1-QC", "4-Het", "Het_check.het"), header = TRUE)
# Heterozygosity rate = (non-missing genotypes - observed homozygotes) / non-missing
het$HET_RATE <- (het$"N.NM." - het$"O.HOM.") / het$"N.NM."
# Compute the mean/SD once instead of recomputing them in every comparison.
het_mean <- mean(het$HET_RATE)
het_sd <- sd(het$HET_RATE)
het_fail <- subset(het, (HET_RATE < het_mean - 3 * het_sd) | (HET_RATE > het_mean + 3 * het_sd))
# Distance from the mean in SD units, kept for reporting.
het_fail$HET_DST <- (het_fail$HET_RATE - het_mean) / het_sd
# Columns 1:2 are FID and IID, the format plink --remove expects.
write.table(het_fail[, 1:2], file.path(output, "plink", "1-QC", "4-Het", "fail-het-qc.txt"), row.names = FALSE, quote = FALSE)

Now, we proceed to remove those heterozygosity rate outliers

# Drop the individuals flagged in fail-het-qc.txt (FID/IID pairs).
plink --bfile "$PWD"/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --remove "$PWD"/data/Output/plink/1-QC/4-Het/fail-het-qc.txt --make-bed --out "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity
run_shell('plink --bfile %CD%/data/Output/plink/1-QC/3-HWE/hardy_weinberg_1 --remove %CD%/data/Output/plink/1-QC/4-Het/fail-het-qc.txt --make-bed --out %CD%/data/Output/plink/1-QC/4-Het/heterozigosity', ignore.stdout = !verbose)


1.4.8 Population structure analysis


We can also check for population substructure in our dataset, for instance by performing a PCA.

This additional analysis is only actually performed on Unix computers; on Windows I have just provided the commands used and an example of the output obtained. It also isn’t run by default, because of the bottleneck caused by this additional analysis. The results obtained are shown as recorded on a .png file of the PCA plot. You can control whether the analysis should be performed by setting the run_PCA variable to TRUE or FALSE.

run_PCA <- FALSE

For that, we downloaded a 1000 Genomes file of 629 individuals genotyped at a large number of SNPs (TODO: fill in the exact SNP count). This file was downloaded from here: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz.

After downloading it the file was converted to plink files, and performed QC as described in the Population Stratification tutorial in https://github.com/MareesAT/GWA_tutorial.


1.4.8.1 Removing unshared SNPs


Just the snps present in both datasets were kept to be able to make the comparison. Due to file size, of the thousand genomes files, we’ve already extracted the relevant SNPs from the thousand genomes plink files. To do so for the EP dataset we might do:

# Keep only SNPs present in both datasets: list the 1KG SNP ids, then subset
# the QC'd EP data to that list.
mkdir -p "$PWD"/data/Output/plink/2-POP/
awk '{print$2}' "$PWD"/data/Input/1KG_PCA/1KG.bim > "$PWD"/data/Output/plink/2-POP/1KG_snps.txt
plink --bfile "$PWD"/data/Output/plink/1-QC/4-Het/heterozigosity --extract "$PWD"/data/Output/plink/2-POP/1KG_snps.txt --make-bed --recode --out "$PWD"/data/Output/plink/2-POP/EP
# Report how many 1KG SNP ids were extracted.
cat "$PWD"/data/Output/plink/2-POP/1KG_snps.txt | wc -l


1.4.8.2 Update build


Then, the next step would be to check that both datasets have the same build and if not update the older 1KG file accordingly.

# Take SNP id + bp position from the EP .map file and update the 1KG data to
# the same genome build with --update-map.
awk '{print$2,$4}' "$PWD"/data/Output/plink/2-POP/EP.map > "$PWD"/data/Output/plink/2-POP/buildEP.txt
plink --bfile "$PWD"/data/Input/1KG_PCA/1KG --update-map "$PWD"/data/Output/plink/2-POP/buildEP.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/1KG_PCA


1.4.8.3 Prepare merging


In order to merge both files, first we have to ascertain that both files refer to the same reference and alternative alleles.

# Force the EP data onto the 1KG reference alleles, then compare the
# id/allele triplets of both .bim files; `uniq -u` keeps lines appearing in
# only one of the two sorted inputs, i.e. the still-mismatching SNPs.
awk '{print$2,$5}' "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim > "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt
plink --bfile "$PWD"/data/Output/plink/2-POP/EP --reference-allele "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_adj
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/EP_adj.bim > "$PWD"/data/Output/plink/2-POP/EP_adj_tmp
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim > "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp
sort "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp "$PWD"/data/Output/plink/2-POP/EP_adj_tmp | uniq -u > "$PWD"/data/Output/plink/2-POP/all_differences.txt
cat "$PWD"/data/Output/plink/2-POP/all_differences.txt

Those differences might be just caused by looking at different strands so we will flip those bases and see if the error persists.

# Flip the strand of every still-mismatching SNP and redo the comparison.
awk '{print$1}' "$PWD"/data/Output/plink/2-POP/all_differences.txt | sort -u > "$PWD"/data/Output/plink/2-POP/flip_list.txt
plink --bfile "$PWD"/data/Output/plink/2-POP/EP_adj --flip "$PWD"/data/Output/plink/2-POP/flip_list.txt --reference-allele "$PWD"/data/Output/plink/2-POP/1KG_ref-list.txt --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_adj_2

## Now check if they are still problematic
awk '{print$2,$5,$6}' "$PWD"/data/Output/plink/2-POP/EP_adj_2.bim > "$PWD"/data/Output/plink/2-POP/EP_adj_2_tmp
sort "$PWD"/data/Output/plink/2-POP/1KG_PCA_tmp "$PWD"/data/Output/plink/2-POP/EP_adj_2_tmp | uniq -u > "$PWD"/data/Output/plink/2-POP/still_differences.txt
cat "$PWD"/data/Output/plink/2-POP/still_differences.txt


1.4.8.4 Merge


And we can see that there are no remaining differences anymore so we shouldn’t remove any SNP and we can proceed to merge both files.

plink --bfile "$PWD"/data/Output/plink/2-POP/EP_adj_2 --bmerge "$PWD"/data/Output/plink/2-POP/1KG_PCA.bed "$PWD"/data/Output/plink/2-POP/1KG_PCA.bim "$PWD"/data/Output/plink/2-POP/1KG_PCA.fam --allow-no-sex --make-bed --out "$PWD"/data/Output/plink/2-POP/EP_1KG_PCA


1.4.8.5 Labeling the superpopulations


Before doing the PCA we can label the data according to different human superpopulations (AFR, EUR etc) in order to be able to keep track of them when we plot them graphically. For that we get a file with population information from the 1000 genomes and transform the subpopulation codes in our files in superpopulation ones.

# Note: for macOS/BSD users use GNU sed instead of default sed, linux users may use sed directly instead of gsed
wget -P "$PWD"/data/Output/plink/2-POP/ ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/20100804.ALL.panel
## Build a "FID IID POP" file from the 1000 Genomes panel (FID = IID = sample id)
awk '{print$1,$1,$2}' "$PWD"/data/Output/plink/2-POP/20100804.ALL.panel > "$PWD"/data/Output/plink/2-POP/pop_1kG.txt
## Collapse subpopulation codes into superpopulations (ASN/AFR/EUR/AMR) in a
## single sed pass; this replaces the former chain of 13 intermediate
## pop_1kG*.txt files while producing an identical pop_1kG14.txt.
gsed -e 's/JPT/ASN/g' -e 's/ASW/AFR/g' -e 's/CEU/EUR/g' -e 's/CHB/ASN/g' \
     -e 's/CHD/ASN/g' -e 's/YRI/AFR/g' -e 's/LWK/AFR/g' -e 's/TSI/EUR/g' \
     -e 's/MXL/AMR/g' -e 's/GBR/EUR/g' -e 's/FIN/EUR/g' -e 's/CHS/ASN/g' \
     -e 's/PUR/AMR/g' \
     "$PWD"/data/Output/plink/2-POP/pop_1kG.txt > "$PWD"/data/Output/plink/2-POP/pop_1kG14.txt

# Create our own population file: every EP sample is labelled "EP"
awk '{print$1,$2,"EP"}' "$PWD"/data/Output/plink/2-POP/EP_adj_2.fam > "$PWD"/data/Output/plink/2-POP/pop_EP.txt

# Concatenate population files and prepend a header line.
cat "$PWD"/data/Output/plink/2-POP/pop_1kG14.txt "$PWD"/data/Output/plink/2-POP/pop_EP.txt | gsed -e '1i\FID IID SUPERPOP' > "$PWD"/data/Output/plink/2-POP/popfile.txt

# remove temporary files from above
rm -f "$PWD"/data/Output/plink/2-POP/pop_*.txt


1.4.8.6 PCA


With the data merged we can perform PCA on plink.

plink --bfile "$PWD"/data/Output/plink/2-POP/EP_1KG_PCA --pca --out "$PWD"/data/Output/plink/2-POP/PCA_results

We will plot the results using a small Python script (all credit to Xavier Farré)

# engine.opts='-l' loads files that are normally loaded when you start the shell like for instance ~/.bash_profile. In this case, python 3 is aliased to python, instead of using python 2.7.
# Print the interpreter version for reproducibility, then render the PCA plot
# (saved as a .png, shown below).
python --version
python "$PWD"/data/Input/1KG_PCA/plot_PCA.py
PCA performed on the EP dataset

PCA performed on the EP dataset


1.5 Defining inheritance models


The next step is to obtain all possible SNP inheritance model combinations and add the new columns to the dataset.

# Derive one column name per SNP and inheritance model
# (suffix "r" = recessive, "a" = additive, "d" = dominant).
models <- lapply(c("r", "a", "d"), function(suffix) paste0(vector_of_snps, suffix))
recessive <- models[[1]]
additive <- models[[2]]
dominant <- models[[3]]
inheritance <- c(recessive, additive, dominant)
# Create the new columns; their values are filled in later.
All[inheritance] <- NA

Then we define the variables (columns) order in the dataset.

# Fixed column layout: identifiers/covariates first, then the centre columns,
# the raw genotypes, and finally the derived inheritance-model columns.
covariate_cols <- c("ID", "Diag", "Sex", "Age_to_use", "Age82", "Region", "E4status", "Centre")
col_order <- c(covariate_cols, centres, vector_of_snps, inheritance)
All <- All[, col_order]

Finally we code the inheritance with numbers depending on the genotype values for each snp. (0 reference, 1 risk allele effect, 2 two risk alleles effect {just for the additive model})

All <- generate_inheritances(snps = list_of_objects, data = All)


1.6 Diagnose possible problems and perform data conversions


After a quick look at the data we decide to codify an unexplained level in the E4status variable (coming from Rotterdam dataset) into missing data.

All$E4status[All$E4status == 2] <- NA

Then we convert all categorical variables into the factor class.

# Convert every column except 1 (ID) and 4 (Age_to_use) -- see col_order
# above -- to factor; the negative index drops those two from the selection.
factors <- -c(1,4)
All[,factors] <- lapply(All[,factors], factor)

The next step to be performed is to do some further data exploration by creating contingency tables of age, sex and E4status by diagnosis and centre.

# Cross-tabulate the listed variables pairwise (i = 2) and three at a time
# (i = 3), storing each table under a name like "Age82_Sex" and collecting
# the table names in list_of_tables.
variables <- c("Age82", "Sex", "E4status", "Diag", "Centre", "Region")
list_of_tables <- vector(mode = "list", length = 2)
for (i in 2:3) {
  # Generate all combinations of variables possible, taking i at a time.
  combinations <- combn(variables, i, simplify = FALSE)
  list_of_tables[[i - 1]] <- character(length = length(combinations))
  # For combination in combinations
  for (n in seq_along(combinations)) {
    combination <- combinations[[n]]
    # Build the xtabs formula, e.g. "~All$Age82 + All$Sex"
    formula <- paste0("All$", combination)
    formula <- paste(formula, collapse = " + ")
    formula <- paste0("~", formula)
    # Create the name of the table as variable1_variable2[_variable3]
    name <- paste(combination, collapse = "_")
    assign(name, xtabs(formula))
    list_of_tables[[i - 1]][n] <- name
  }
}
names(list_of_tables) <- c("Two variables", "Three variables")

We can take a quick look at an overview of the contingency tables created by printing the list_of_tables object.

list_of_tables
## $`Two variables`
##  [1] "Age82_Sex"       "Age82_E4status"  "Age82_Diag"      "Age82_Centre"   
##  [5] "Age82_Region"    "Sex_E4status"    "Sex_Diag"        "Sex_Centre"     
##  [9] "Sex_Region"      "E4status_Diag"   "E4status_Centre" "E4status_Region"
## [13] "Diag_Centre"     "Diag_Region"     "Centre_Region"  
## 
## $`Three variables`
##  [1] "Age82_Sex_E4status"     "Age82_Sex_Diag"         "Age82_Sex_Centre"      
##  [4] "Age82_Sex_Region"       "Age82_E4status_Diag"    "Age82_E4status_Centre" 
##  [7] "Age82_E4status_Region"  "Age82_Diag_Centre"      "Age82_Diag_Region"     
## [10] "Age82_Centre_Region"    "Sex_E4status_Diag"      "Sex_E4status_Centre"   
## [13] "Sex_E4status_Region"    "Sex_Diag_Centre"        "Sex_Diag_Region"       
## [16] "Sex_Centre_Region"      "E4status_Diag_Centre"   "E4status_Diag_Region"  
## [19] "E4status_Centre_Region" "Diag_Centre_Region"

Another step we can perform for the sake of error checking is creating summary tables of mean, min, max age by diagnosis and centre.

First, we give variables shorter names for ease of use.

# Convenience aliases for the summary tables below.
ages <- All$Age_to_use
diagnostic <- All$Diag
# NOTE(review): this overwrites the `centres` vector of centre names used for
# col_order earlier; it is reassigned again before the per-centre subsetting.
centres <- All$Centre
regions <- All$Region

Then we create the summary tables.

# Build descriptive-statistics tables (via the project helper createSummary)
# of age grouped by diagnosis, centre and region; table names are collected
# in summary_tables for the printing loop below.
### Only Age by diagnosis
variable <- ages
group <- diagnostic
AgeAll <- createSummary(variable, group)

### Age of alzheimer patients by Centre
variable <- ages[diagnostic == "AD"]
group <- centres[diagnostic == "AD"]
AgeAD <- createSummary(variable, group)

### Age of control patients by Centre
variable <- ages[diagnostic == "Control"]
group <- centres[diagnostic == "Control"]
AgeControl <- createSummary(variable, group)

### Age of alzheimer patients by Region (N.Eur or Spain)
variable <- ages[diagnostic == "AD"]
group <- regions[diagnostic == "AD"]
AgeADRegion <- createSummary(variable, group)

### Age of control patients by Region (N.Eur or Spain)
variable <- ages[diagnostic == "Control"]
group <- regions[diagnostic == "Control"]
AgeControlRegion <- createSummary(variable, group)

summary_tables <- c("AgeAll", "AgeAD", "AgeControl", "AgeADRegion", "AgeControlRegion")

# Print each summary table preceded by its name, blank-line separated.
for (summary in summary_tables) {
  print(summary)
  print(get(summary))
  cat("\n")
}
## [1] "AgeAll"
## [1] ""                                                                            
## [2] " Descriptive statistics by group "                                           
## [3] "group: Control"                                                              
## [4] "   vars    n  mean   sd median trimmed  mad min max range  skew kurtosis  se"
## [5] "X1    1 6057 81.92 7.75     82   82.09 7.41  60 106    46 -0.19      0.1 0.1"
## [6] "------------------------------------------------------------ "               
## [7] "group: AD"                                                                   
## [8] "   vars    n mean   sd median trimmed mad min max range  skew kurtosis   se" 
## [9] "X1    1 2659   81 7.95     81   81.24 8.9  60 109    49 -0.24    -0.21 0.15" 
## 
## [1] "AgeAD"
##  [1] ""                                                                             
##  [2] " Descriptive statistics by group "                                            
##  [3] "group: BRISTOL"                                                               
##  [4] "   vars   n  mean   sd median trimmed mad min max range skew kurtosis   se"   
##  [5] "X1    1 210 80.33 8.59     81   80.77 8.9  60  99    39 -0.4     -0.3 0.59"   
##  [6] "------------------------------------------------------------ "                
##  [7] "group: MADRID"                                                                
##  [8] "   vars   n  mean  sd median trimmed mad min max range skew kurtosis   se"    
##  [9] "X1    1 232 75.09 9.8     74   74.06 8.9  60 109    49 0.96     0.74 0.64"    
## [10] "------------------------------------------------------------ "                
## [11] "group: NOTTINGHAM"                                                            
## [12] "   vars   n  mean  sd median trimmed  mad min max range  skew kurtosis   se"  
## [13] "X1    1 253 82.74 8.3     84   83.11 7.41  61 101    40 -0.41    -0.15 0.52"  
## [14] "------------------------------------------------------------ "                
## [15] "group: OPTIMA"                                                                
## [16] "   vars   n  mean   sd median trimmed mad min max range  skew kurtosis   se"  
## [17] "X1    1 268 79.29 8.04     80   79.59 8.9  60 100    40 -0.26     -0.5 0.49"  
## [18] "------------------------------------------------------------ "                
## [19] "group: OVIEDO"                                                                
## [20] "   vars   n  mean   sd median trimmed  mad min max range  skew kurtosis   se" 
## [21] "X1    1 256 76.85 5.76     77   76.89 5.93  63  94    31 -0.05    -0.17 0.36" 
## [22] "------------------------------------------------------------ "                
## [23] "group: SANTANDER"                                                             
## [24] "   vars   n  mean   sd median trimmed  mad min max range  skew kurtosis   se" 
## [25] "X1    1 226 77.35 6.59     78   77.58 5.93  60  98    38 -0.24     0.11 0.44" 
## [26] "------------------------------------------------------------ "                
## [27] "group: ROTTERDAM"                                                             
## [28] "   vars    n  mean   sd median trimmed  mad min max range  skew kurtosis   se"
## [29] "X1    1 1214 83.81 6.48     84   84.03 5.93  61 101    40 -0.34    -0.03 0.19"
## 
## [1] "AgeControl"
##  [1] ""                                                                              
##  [2] " Descriptive statistics by group "                                             
##  [3] "group: BRISTOL"                                                                
##  [4] "   vars  n  mean   sd median trimmed  mad min max range  skew kurtosis   se"   
##  [5] "X1    1 88 81.32 8.21   81.5   81.58 9.64  62  96    34 -0.22    -0.68 0.88"   
##  [6] "------------------------------------------------------------ "                 
##  [7] "group: MADRID"                                                                 
##  [8] "   vars   n  mean    sd median trimmed   mad min max range skew kurtosis   se" 
##  [9] "X1    1 278 75.94 10.57     74   75.11 11.12  60 104    44 0.64    -0.24 0.63" 
## [10] "------------------------------------------------------------ "                 
## [11] "group: NOTTINGHAM"                                                             
## [12] "   vars   n  mean    sd median trimmed   mad min max range  skew kurtosis   se"
## [13] "X1    1 148 78.43 10.13   78.5   78.42 11.12  60 104    44 -0.04    -0.84 0.83"
## [14] "------------------------------------------------------------ "                 
## [15] "group: OPTIMA"                                                                 
## [16] "   vars   n  mean   sd median trimmed  mad min max range skew kurtosis   se"   
## [17] "X1    1 208 79.76 7.42     80    79.9 7.41  62 100    38 -0.1    -0.18 0.51"   
## [18] "------------------------------------------------------------ "                 
## [19] "group: OVIEDO"                                                                 
## [20] "   vars  n  mean   sd median trimmed  mad min max range  skew kurtosis   se"   
## [21] "X1    1 87 73.24 4.57     74   73.46 4.45  62  87    25 -0.28     0.26 0.49"   
## [22] "------------------------------------------------------------ "                 
## [23] "group: SANTANDER"                                                              
## [24] "   vars   n  mean   sd median trimmed  mad min max range  skew kurtosis   se"  
## [25] "X1    1 106 80.95 7.56   81.5   81.03 8.15  62  99    37 -0.11     -0.3 0.73"  
## [26] "------------------------------------------------------------ "                 
## [27] "group: ROTTERDAM"                                                              
## [28] "   vars    n mean   sd median trimmed  mad min max range  skew kurtosis  se"   
## [29] "X1    1 5142 82.6 7.26     83   82.72 7.41  60 106    46 -0.14     0.22 0.1"   
## 
## [1] "AgeADRegion"
## [1] ""                                                                             
## [2] " Descriptive statistics by group "                                            
## [3] "group: N.Eur"                                                                 
## [4] "   vars    n  mean   sd median trimmed  mad min max range  skew kurtosis   se"
## [5] "X1    1 1945 82.67 7.41     83   83.05 7.41  60 101    41 -0.48     0.11 0.17"
## [6] "------------------------------------------------------------ "                
## [7] "group: Spain"                                                                 
## [8] "   vars   n  mean   sd median trimmed  mad min max range skew kurtosis   se"  
## [9] "X1    1 714 76.44 7.59     76   76.25 7.41  60 109    49 0.44      0.9 0.28"  
## 
## [1] "AgeControlRegion"
## [1] ""                                                                            
## [2] " Descriptive statistics by group "                                           
## [3] "group: N.Eur"                                                                
## [4] "   vars    n  mean   sd median trimmed  mad min max range  skew kurtosis  se"
## [5] "X1    1 5586 82.37 7.42     82   82.52 7.41  60 106    46 -0.18      0.2 0.1"
## [6] "------------------------------------------------------------ "               
## [7] "group: Spain"                                                                
## [8] "   vars   n  mean   sd median trimmed mad min max range skew kurtosis   se"  
## [9] "X1    1 471 76.57 9.44     75   76.03 8.9  60 104    44 0.54    -0.06 0.43"


1.7 Creating data subsets


We create a data subset for each of the regions.

# Bind one data frame per region level to the level's own name (e.g. `Spain`).
# NOTE(review): the local `subset` shadows base::subset as a data object;
# function-call lookup still finds the function, but the name is confusing.
for (region in levels(All$Region)) {
  subset <- subset(All, Region == region)
  assign(region, subset)
}

And also for each individual center.

# Bind one data frame per centre, under the title-cased centre name
# (e.g. "BRISTOL" -> `Bristol`) to avoid clashing with other bindings.
for (centre in levels(All$Centre)) {
  subset <- subset(All, Centre == centre)
  centre <- stringr::str_to_title(centre)
  assign(centre, subset)
}

Now, that we’ve done that it would be interesting to also subset according to each one of the covariates.

 variables <- c("Age82", "Sex", "E4status")
for (variable in variables) {
  for (level in levels(All[[variable]])) {
    name <- paste(variable, level, sep = "_")
    subset <- All[All[[variable]] == level,]
    assign(name, subset)
  }
}

And also subset the covariates but by region.

# Same per-covariate subsetting, but within each region's data frame;
# names look like "Spain_Sex_Male".
for (region in levels(All$Region)) {
  Region <- get(region)
  for (variable in variables) {
    for (level in levels(Region[[variable]])) {
      name <- paste(region, variable, level, sep = "_")
      # which() drops NA rows; `==` alone would add all-NA rows when the
      # covariate (e.g. E4status) is missing.
      subset <- Region[which(Region[[variable]] == level), ]
      assign(name, subset)
    }
  }
}

And of course, subsetting those same covariates but by center.

# Same per-covariate subsetting, but within each centre's data frame;
# names look like "Bristol_Sex_Male".
centres <- levels(All$Centre)
centres <- stringr::str_to_title(centres)

for (centre in centres) {
  Centre <- get(centre)
  for (variable in variables) {
    for (level in levels(All[[variable]])) {
      name <- paste(centre, variable, level, sep = "_")
      # which() drops NA rows; `==` alone would add all-NA rows when the
      # covariate (e.g. E4status) is missing.
      subset <- Centre[which(Centre[[variable]] == level), ]
      assign(name, subset)
    }
  }
}


2 Analysis


2.1 Main effects


First, we select the datasets into which we want to perform the analysis and then all covariates we want to control for.

# Datasets to analyse and covariates to adjust for in the glm models.
DATASETS <- c("All", "N.Eur", "Spain")
variables <- c("Sex", "E4status", "Age82", "Centre")

Then we reorder the dataset columns alphabetically, set Santander (rather than Madrid) as the reference level in the Spanish dataset since the former has a higher N, and make sure the analyses are performed with respect to the Control reference level rather than the diagnosed one.

# Sort each dataset's columns alphabetically so model terms line up
# across datasets.
for (i in seq_along(DATASETS)) {
  assign(DATASETS[i], get(DATASETS[i])[,order(names(get(DATASETS[i])))])
}
# Use Santander (highest N among the Spanish centres, per the text above) as
# the Spanish reference level. Naming the level is robust; the previous
# `ref = 6` silently depended on the alphabetical position of the level and,
# with all centre levels retained in the subset, need not have been Santander.
Spain$Centre <- relevel(Spain$Centre, ref = "SANTANDER")
# Model AD risk relative to the Control group.
All$Diag <- relevel(All$Diag, ref = "Control")

Finally we perform the main effects analysis for the selected SNPs.

master_list <- perform_analysis(.mode = "main_effects", .data = DATASETS, snps = list_of_objects, covariates = variables)

This function outputs a list with the results of the glm contained in it (best model of inheritance and glm results). An example of the structure of the list with only the main effects performed can be seen here, consisting in the first snp of the All (N.Eur + Spain) dataset:

## $rs5749131
## $rs5749131$Gene
## [1] "TCN2"
## 
## $rs5749131$Best_model
##                 rs5749131a 
## "[1] AIC quasi= 7840.0857" 
## 
## $rs5749131$Main_effects
## $rs5749131$Main_effects$rs5749131a
##                  Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)         0.662      0.151 1.938 1.441 2.606 1.205765e-05
## SexMale            -0.521      0.059 0.594 0.529 0.667 1.486778e-18
## E4statusE4+         1.124      0.059 3.077 2.742 3.452 6.303636e-80
## Age82>82            0.324      0.060 1.383 1.230 1.555 6.094967e-08
## CentreMADRID       -0.977      0.168 0.376 0.271 0.523 6.295820e-09
## CentreNOTTINGHAM   -0.401      0.175 0.670 0.475 0.944 2.194844e-02
## CentreOPTIMA       -0.659      0.168 0.517 0.372 0.719 8.641606e-05
## CentreOVIEDO        0.348      0.189 1.416 0.978 2.052 6.581899e-02
## CentreSANTANDER    -0.116      0.184 0.890 0.621 1.277 5.272912e-01
## CentreROTTERDAM    -2.434      0.142 0.088 0.066 0.116 2.672129e-64
## rs5749131a1        -0.123      0.063 0.884 0.781 1.000 5.056580e-02
## rs5749131a2        -0.237      0.083 0.789 0.671 0.928 4.239103e-03
## 
## $rs5749131$Main_effects$rs5749131d
##                  Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)         0.660      0.151 1.935 1.439 2.601 1.253322e-05
## SexMale            -0.522      0.059 0.594 0.529 0.666 1.349425e-18
## E4statusE4+         1.124      0.059 3.077 2.742 3.451 4.833744e-80
## Age82>82            0.325      0.060 1.384 1.231 1.556 5.622751e-08
## CentreMADRID       -0.973      0.168 0.378 0.272 0.525 7.167167e-09
## CentreNOTTINGHAM   -0.399      0.175 0.671 0.476 0.945 2.241794e-02
## CentreOPTIMA       -0.656      0.168 0.519 0.374 0.721 9.325110e-05
## CentreOVIEDO        0.349      0.189 1.417 0.978 2.053 6.526792e-02
## CentreSANTANDER    -0.114      0.184 0.892 0.622 1.279 5.348744e-01
## CentreROTTERDAM    -2.433      0.142 0.088 0.066 0.116 2.710091e-64
## rs5749131d1        -0.154      0.060 0.857 0.763 0.964 9.900838e-03
## 
## $rs5749131$Main_effects$rs5749131r
##                  Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)         0.592      0.147 1.807 1.356 2.409 5.525052e-05
## SexMale            -0.521      0.059 0.594 0.529 0.667 1.459895e-18
## E4statusE4+         1.123      0.059 3.073 2.739 3.447 8.260309e-80
## Age82>82            0.324      0.060 1.382 1.229 1.554 6.461236e-08
## CentreMADRID       -0.983      0.168 0.374 0.269 0.520 5.101118e-09
## CentreNOTTINGHAM   -0.404      0.175 0.667 0.474 0.940 2.081654e-02
## CentreOPTIMA       -0.667      0.168 0.513 0.369 0.713 7.047623e-05
## CentreOVIEDO        0.347      0.189 1.414 0.976 2.049 6.696676e-02
## CentreSANTANDER    -0.125      0.184 0.883 0.616 1.265 4.966989e-01
## CentreROTTERDAM    -2.436      0.142 0.088 0.066 0.116 1.803317e-64
## rs5749131r1        -0.163      0.074 0.850 0.735 0.982 2.726669e-02

We can appreciate that the main effects analysis were also performed for the Spanish and Northern European subsets.

names(master_list)
## [1] "All"   "N.Eur" "Spain"

Then, selecting a threshold of 0.05, before applying multiple testing correction, we can obtain the following results for the snps under the threshold and to which genes it belongs.

print_significant_results(master_list)
## 
##  rs5749131a2  rs5753231r1  rs1131603d1  rs5997711r1 rs11030102d1 rs11030119d1 
##        0.004        0.046        0.049        0.010        0.001        0.032 
## rs12288512d1   rs429358a1   rs429358a2     rs7412d1  rs4945261d1  rs2373115d1 
##        0.014        0.000        0.000        0.000        0.025        0.030 
##  rs2695121d1  rs1052533r1   rs744373a1   rs744373a2 
##        0.041        0.038        0.042        0.000 
## 
## 
## sig_genes
##  TCN2  APOE  BDNF  BIN1  GAB2 NR1H2 
##     4     3     3     2     2     2
print_significant_results(master_list, corrected = T)
## 
## rs11030102d1   rs429358a1   rs429358a2     rs7412d1   rs744373a2 
##        0.037        0.001        0.000        0.001        0.002 
## 
## 
## sig_genes
## APOE BDNF BIN1 
##    3    1    1

Here, we can see that both additive levels for snp rs429358 remain significant, so what we really have are 4 snp models that are found significantly associated with AD, even after multiple testing correction.

One of those is the APOE gene, whose E4 variant was found to be the largest known genetic risk factor for late-onset sporadic AD. The rationale is that the APOE-ε4 isoform is not as effective as the others at promoting these reactions, resulting in increased vulnerability to AD in individuals with that gene variation.

The BDNF snp (rs11030102) has been linked to decreased BDNF serum levels. In the adult brain, BDNF maintains high expression levels and regulates both excitatory and inhibitory synaptic transmission and activity-dependent plasticity (Miranda et al., 2019)

The bridging integrator 1 (BIN1) gene is the second most important susceptibility gene for late-onset Alzheimer’s disease (LOAD) after apolipoprotein E (APOE) gene. (Hu et al., 2021). BIN1 encodes a Myc-interacting protein but its role in AD is still unclear.

To see a count of the number of genes studied, according to the number of snps we can use the following code:

# Tally how many of the studied SNPs map to each gene (All dataset).
# vapply pins the expected type (one character string per SNP), unlike sapply.
genes_studied <- vapply(master_list$All, function(snp) {snp$Gene}, character(1))
genes_studied <- unname(genes_studied)
genes_studied <- table(genes_studied)
sort(genes_studied, decreasing = TRUE)
## genes_studied
##    TCN2    MTRR   SORT1    BDNF   SHMT1    RFC1    APOE   ABCA1 CYP46A1     DBH 
##      11      10       6       5       5       4       3       2       2       2 
##    GAB2    MAPT    NPC1   NR1H2   NTRK2    PEMT     VDR    BIN1    CD33    CDK5 
##       2       2       2       2       2       2       2       1       1       1 
##   HMDX1   HMGCR    IL10    LRP1 MTHFD1L    PRNP SLC19A1   TRAK2 
##       1       1       1       1       1       1       1       1

Here we can see that from the 75 SNPs studied, all 3 APOE snps were found significant as well as the only SNP studied for BIN1. For BDNF, though just one out of five SNPs was deemed significant according to main effects, snp rs11030102.

To measure the strength of the association found we can extract the OR from the main effects results.

# Extract the odds ratio (column 3 of the glm table) for each significant
# SNP model level from the All-dataset main-effects results.
significant_snps <- c("rs11030102d1","rs429358a1", "rs429358a2", "rs7412d1","rs744373a2")
# sapply is kept deliberately: with length-1 named results it concatenates the
# inner name, yielding names like "rs429358a2.OR", which the sub() pattern
# below relies on; vapply would drop the ".OR" suffix and break it.
OR_results <- sapply(significant_snps, function(snp_model_lvl){
  snp_model <- sub("[0-2]$", "", snp_model_lvl);   # strip dosage level -> model id
  snp <- sub("[a-z]$", "", snp_model);             # strip model letter -> snp id
  glm_results <- master_list$All[[snp]]$Main_effects[[snp_model]]
  index <- grep(rownames(glm_results), pattern = snp_model_lvl)
  glm_results[index,][3]
})
# Recover the bare snp ids from the "<snp><model><level>.OR" names and map
# them to their genes.
snps <- sub(names(OR_results), pattern = "[a-z][0-2].OR$", replacement = "")
genes <- sapply(snps, function(snp) {list_of_objects[[snp]]$gene})
OR_results2 <- OR_results
names(OR_results2) <- genes
sort(OR_results, decreasing = TRUE)
##   rs429358a2.OR   rs429358a1.OR   rs744373a2.OR rs11030102d1.OR     rs7412d1.OR 
##           8.526           1.777           1.491           1.203           0.681
sort(OR_results2, decreasing = T)
##  APOE  APOE  BIN1  BDNF  APOE 
## 8.526 1.777 1.491 1.203 0.681

Thus, we can see that under the additive model for SNP rs429358, which corresponds to the APOE gene, carrying one variant allele increases the odds of having the disease 1.7 times with respect to not carrying it, while the full dosage increases it 8 times.

On the other hand, the others, hold more moderate effects, with the dominant model of snp rs7412d, which also corresponds to APOE, increases the odds of having the disease 0.682. Or what is the same, decreases it by 1/0.682 = 1.4662757

We can generate a dataframe to plot these results:

# Assemble a data frame of the significant SNP models with their gene, odds
# ratio, confidence bounds and p-value, one row per model, for plotting.
plot_df <- data.frame(matrix(NA, nrow = length(significant_snps), ncol = 6))
colnames(plot_df) <- c("snp", "gene", "OR", "lower", "upper", "pvalue")
for (i in seq_along(significant_snps)) {
  snp_model_lvl <- significant_snps[i]
  # Peel off the dosage digit, then the model letter, to get the bare snp id.
  snp_model <- sub(snp_model_lvl, pattern = "[0-2]$", replacement = "")
  snp <- sub(snp_model, pattern = "[a-z]$", replacement = "")
  SNP <- master_list$All[[snp]]
  gene <- SNP$Gene
  main_effects <- SNP$Main_effects[[snp_model]]
  # Row of the glm table holding this model level's coefficients.
  snp_term <- grep(pattern = snp_model_lvl, x = rownames(main_effects))
  OR <- main_effects[snp_term, 3]
  lower <- main_effects[snp_term, 4]
  upper <- main_effects[snp_term, 5]
  pvalue <- main_effects[snp_term, 6]
  # Fill the whole row at once; columns coerce exactly as per-cell assignment.
  plot_df[i, ] <- list(snp_model_lvl, gene, OR, lower, upper, pvalue)
}

And plot them:

## Warning: geom_vline(): Ignoring `mapping` because `xintercept` was provided.


2.2 Covariate interaction


The next step in the analysis would be to examine all possible interactions of the 75 studied SNPs with the covariates Age82, Sex & E4status.

In practice this means repeating the models from the main effects but by adding one interaction term at a time.

In order to reduce the multiple testing performed and due to the previous findings of what the best models were, only the best snp models chosen for each main effect were employed in this subsequent step.

master_list <- perform_analysis(.mode = "interaction", .data = DATASETS, snps = list_of_objects, covariates = variables)

And here we can see an example of the kind of results obtained by doing this analysis.

master_list$All[[1]]$Interactions$Other_covariates
## $Age75
## $Age75$Name
## NULL
## 
## $Age75$Summary
## NULL
## 
## $Age75$SF
## NULL
## 
## $Age75$Significant
## NULL
## 
## 
## $Sex
## $Sex$Name
## [1] "rs5749131a1:SexMale" "rs5749131a2:SexMale"
## 
## $Sex$Summary
##                     Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)            0.669      0.154 1.953 1.443 2.644 1.473689e-05
## E4statusE4+            1.124      0.059 3.076 2.742 3.451 6.757858e-80
## Age82>82               0.325      0.060 1.384 1.231 1.557 5.580781e-08
## CentreMADRID          -0.980      0.168 0.375 0.270 0.522 5.618590e-09
## CentreNOTTINGHAM      -0.406      0.175 0.666 0.473 0.939 2.036191e-02
## CentreOPTIMA          -0.664      0.168 0.515 0.370 0.715 7.574688e-05
## CentreOVIEDO           0.343      0.189 1.410 0.973 2.042 6.971030e-02
## CentreSANTANDER       -0.118      0.184 0.888 0.620 1.274 5.202003e-01
## CentreROTTERDAM       -2.438      0.142 0.087 0.066 0.115 1.603023e-64
## rs5749131a1           -0.103      0.080 0.902 0.771 1.055 1.966077e-01
## rs5749131a2           -0.318      0.106 0.728 0.591 0.896 2.805680e-03
## SexMale               -0.532      0.101 0.587 0.482 0.716 1.505749e-07
## rs5749131a1:SexMale   -0.053      0.130 0.949 0.735 1.224 6.850088e-01
## rs5749131a2:SexMale    0.206      0.170 1.229 0.881 1.713 2.242916e-01
## 
## $Sex$SF
## [1] 0.949 1.229
## 
## $Sex$Significant
## [1] FALSE FALSE
## 
## 
## $E4status
## $E4status$Name
## [1] "rs5749131a1:E4statusE4+" "rs5749131a2:E4statusE4+"
## 
## $E4status$Summary
##                         Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)                0.684      0.154 1.983 1.465 2.684 9.529236e-06
## SexMale                   -0.520      0.059 0.594 0.529 0.667 1.791717e-18
## Age82>82                   0.324      0.060 1.382 1.229 1.554 6.619788e-08
## CentreMADRID              -0.980      0.168 0.375 0.270 0.522 5.775270e-09
## CentreNOTTINGHAM          -0.404      0.175 0.668 0.474 0.941 2.096352e-02
## CentreOPTIMA              -0.663      0.168 0.515 0.371 0.716 7.787719e-05
## CentreOVIEDO               0.345      0.189 1.412 0.975 2.046 6.825821e-02
## CentreSANTANDER           -0.115      0.184 0.892 0.622 1.278 5.324574e-01
## CentreROTTERDAM           -2.436      0.142 0.088 0.066 0.116 1.938331e-64
## rs5749131a1               -0.176      0.080 0.839 0.718 0.981 2.740924e-02
## rs5749131a2               -0.210      0.104 0.811 0.661 0.995 4.463333e-02
## E4statusE4+                1.067      0.102 2.907 2.382 3.548 1.315826e-25
## rs5749131a1:E4statusE4+    0.139      0.131 1.149 0.890 1.485 2.862701e-01
## rs5749131a2:E4statusE4+   -0.068      0.171 0.934 0.668 1.305 6.891326e-01
## 
## $E4status$SF
## [1] 1.149 0.934
## 
## $E4status$Significant
## [1] FALSE FALSE
## 
## 
## $Age82
## $Age82$Name
## [1] "rs5749131a1:Age82>82" "rs5749131a2:Age82>82"
## 
## $Age82$Summary
##                      Estimate Std. Error    OR lower upper     Pr(>|t|)
## (Intercept)             0.690      0.156 1.993 1.467 2.708 1.052890e-05
## SexMale                -0.523      0.059 0.593 0.528 0.665 1.127698e-18
## E4statusE4+             1.125      0.059 3.079 2.744 3.455 6.524475e-80
## CentreMADRID           -0.982      0.168 0.375 0.270 0.521 5.529357e-09
## CentreNOTTINGHAM       -0.406      0.175 0.666 0.473 0.939 2.053118e-02
## CentreOPTIMA           -0.665      0.168 0.514 0.370 0.715 7.551328e-05
## CentreOVIEDO            0.341      0.189 1.406 0.970 2.038 7.184415e-02
## CentreSANTANDER        -0.119      0.184 0.888 0.619 1.274 5.190076e-01
## CentreROTTERDAM        -2.441      0.143 0.087 0.066 0.115 1.541780e-64
## rs5749131a1            -0.196      0.092 0.822 0.686 0.984 3.271840e-02
## rs5749131a2            -0.157      0.119 0.855 0.677 1.079 1.867526e-01
## Age82>82                0.285      0.099 1.329 1.094 1.615 4.248406e-03
## rs5749131a1:Age82>82    0.138      0.126 1.148 0.896 1.471 2.748867e-01
## rs5749131a2:Age82>82   -0.157      0.166 0.855 0.617 1.184 3.455101e-01
## 
## $Age82$SF
## [1] 1.148 0.855
## 
## $Age82$Significant
## [1] FALSE FALSE

First, the variable “Name” gives the interaction studied; “Summary” holds the full glm model results obtained; and the remaining variables extract the most interesting results for the interaction: the “SF” and whether it is found “Significant” or not.

The SF is a statistic used to measure interactions in complex diseases. It is defined as the OR of the interaction term divided by the product of the ORs of each member of the interaction on its own.

Thus, it makes a good and easy-to-understand metric for what we are measuring: if there were no interaction effect, we would expect the SF to be close to 1, since the impact of the combination should be about the same as the product of the effects of each member on its own. However, if the SF is found to be greater or lower than 1, it means there is a synergistic (or antagonistic) effect in which the odds of having the disease are increased or reduced, respectively.

Having said that, we can obtain the significant results found for this snp-covariate interaction measured: